We want to look at real datasets to simulate realistic datasets.

data("allen")

prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]
filter <- apply(assay(prefilter) > 10, 1, sum) >= 10

postfilter <- prefilter[filter,]
bio <- as.factor(colData(prefilter)$driver_1_s)
cl <- as.factor(colData(prefilter)$Primary.Type)
postfilter <- assay(postfilter)

vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
core <- postfilter[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)

We will color-code the cells by either known cell type or by inferred cluster (inferred in the original study). Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells. Number of retained genes:

print(sum(filter))
## [1] 11545
dim(core)
## [1] 1000  285

To speed up the computations, I will focus on the top 1,000 most variable genes. IS IT A GOOD IDEA?

mean = apply(log1p(assay(prefilter)), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(assay(prefilter) == 0), xlim = c(1,11), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = 'no filtering')

pal =colorRampPalette(c("black",'black',"red","yellow"), space="rgb")
par(mfrow = c(1,3))
smoothScatter(mean, rowMeans(assay(prefilter) == 0), xlim = c(1,11),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp =pal, main = 'no filtering')

mean = apply(log1p(postfilter), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(postfilter == 0),xlim = c(1,11),
              nbin = 256, nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'after filtering')

mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(core == 0),xlim = c(1,11),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

Fit data with K = 2, V and X intercepts in both Mu and Pi, and commondispersion = FALSE.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 367.510 260.190 244.365

True W

If we simulate W from real data, it will look like that.

par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="color = known cell-type")
#legend("bottomright", levels(bio), fill=cols)

plot(zinb@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="color = inferred clusters")

#legend("topright", levels(cl), fill=cols2)

Gamma

gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.3267906      0.3799835  0.8543153
## gamma_pi       -0.3267906  1.0000000     -0.9919305 -0.4795440
## detection_rate  0.3799835 -0.9919305      1.0000000  0.5275845
## coverage        0.8543153 -0.4795440      0.5275845  1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = bio)
gamma = melt(gamma)

ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable) + xlab('gamma')

ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

Beta

beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

df <- data.frame(beta_mu = beta_mu,
                 beta_pi = beta_pi)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.3743153
## beta_pi -0.3743153  1.0000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable, scales = 'free') + xlab('beta')

ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(min, max))
## Warning: Removed 1042 rows containing non-finite values (stat_boxplot).

Alpha

par(mfrow=c(1,2))
plot(t(zinb@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ])
pairs(df, pch=19)

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2  alpha_pi_1  alpha_pi_2
## alpha_mu_1  1.00000000 -0.09287074 -0.62039554  0.04799110
## alpha_mu_2 -0.09287074  1.00000000  0.02244009 -0.57466371
## alpha_pi_1 -0.62039554  0.02244009  1.00000000  0.04448713
## alpha_pi_2  0.04799110 -0.57466371  0.04448713  1.00000000

Dispersion

par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.4793014
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')

par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))

Estimated mu and pi

par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=cols[bio], ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), log1p(coverage), xlab="Average estimated log Mu", ylab="log Coverage", pch=19, col=cols[bio])

par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(2,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(2,8),
              xlab = "Estimated Mean Expression", main = 'Fitted',
              ylab = "Estimated Dropout Rate",ylim = c(0,1),
              colramp = pal)

mdplot(matrix(c(colMeans(log1p(getMu(zinb))), mean), ncol = 2),
       main = 'MD plot - mu')
abline(h=0, col = 'red')
mdplot(matrix(c(colMeans(getPi(zinb)), rowMeans(core == 0)), ncol = 2),
       main = 'MD plot - pi')
abline(h=0, col = 'red')